{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this notebook, we showcase how to use the KVpress pipelines by answering questions about the NVIDIA Wikipedia article."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "\n",
    "import torch\n",
    "from transformers import pipeline\n",
    "\n",
    "from kvpress import (\n",
    "    ExpectedAttentionPress,\n",
    "    KnormPress,\n",
    "    ObservedAttentionPress,\n",
    "    RandomPress,\n",
    "    SnapKVPress,\n",
    "    StreamingLLMPress,\n",
    "    TOVAPress,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load the pipeline and data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ddb9b6e0a6134ed7b30653aad4b55615",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Build the KVpress text-generation pipeline\n",
    "\n",
    "# Checkpoint to load; uncomment one of the alternatives to try another model\n",
    "ckpt = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
    "# ckpt = \"microsoft/Phi-3.5-mini-instruct\"\n",
    "# ckpt = \"mistralai/Mistral-Nemo-Instruct-2407\"\n",
    "device = \"cuda:0\"\n",
    "\n",
    "# Attention backend: use \"eager\" for ObservedAttentionPress and \"sdpa\" if you can't use \"flash_attention_2\"\n",
    "attn_implementation = \"flash_attention_2\"\n",
    "\n",
    "pipe = pipeline(\n",
    "    \"kv-press-text-generation\",\n",
    "    model=ckpt,\n",
    "    device=device,\n",
    "    torch_dtype=\"auto\",\n",
    "    model_kwargs={\"attn_implementation\": attn_implementation},\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of tokens: 8747\n"
     ]
    }
   ],
   "source": [
    "# Load data: fetch the Nvidia Wikipedia page and keep only the text of its <p> elements\n",
    "url = \"https://en.wikipedia.org/wiki/Nvidia\"\n",
    "response = requests.get(url, timeout=30)  # bounded wait so the cell cannot hang on a stalled connection\n",
    "response.raise_for_status()  # fail loudly on an HTTP error instead of using an error page as context\n",
    "content = response.content\n",
    "soup = BeautifulSoup(content, \"html.parser\")\n",
    "context = \"\".join([p.text for p in soup.find_all(\"p\")]) + \"\\n\\n\"\n",
    "\n",
    "# Tokenize the full context to report its length\n",
    "tokens = pipe.tokenizer.encode(context, return_tensors=\"pt\").to(device)\n",
    "print(f\"Number of tokens: {tokens.size(1)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Use the pipeline with a press"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Choose a press and its compression ratio; the following cells can be re-run with any of the presses below\n",
    "compression_ratio = 0.5\n",
    "\n",
    "# Alternatives:\n",
    "# press = KnormPress(compression_ratio)\n",
    "# press = RandomPress(compression_ratio)\n",
    "press = ExpectedAttentionPress(compression_ratio)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Question:   Complete this sentence: The Nvidia GeForce Partner Program was a ...\n",
      "Answer:     marketing program designed to provide partnering companies with benefits such as public relations support, video game bundling, and marketing development funds.\n",
      "Prediction: marketing program designed to provide partnering companies with benefits such as public relations support, video game bundling, and marketing development funds.\n"
     ]
    }
   ],
   "source": [
    "# Ask a single question about the context and compare the prediction with the reference answer\n",
    "\n",
    "question = \"Complete this sentence: The Nvidia GeForce Partner Program was a ...\"\n",
    "true_answer = \"marketing program designed to provide partnering companies with benefits such as public relations support, video game bundling, and marketing development funds.\"\n",
    "\n",
    "output = pipe(context, question=question, press=press)\n",
    "pred_answer = output[\"answer\"]\n",
    "\n",
    "print(\n",
    "    f\"Question:   {question}\",\n",
    "    f\"Answer:     {true_answer}\",\n",
    "    f\"Prediction: {pred_answer}\",\n",
    "    sep=\"\\n\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Question:   What happened on March 1, 2024?\n",
      "Answer:     Nvidia became the third company in the history of the United States to close with a market capitalization in excess of $2 trillion\n",
      "Prediction: Nvidia became the third company in U.S. history to close with a market capitalization of over $2 trillion.\n",
      "\n",
      "Question:   What was the unofficial company motto of Nvidia during the early days?\n",
      "Answer:     Our company is thirty days from going out of business\n",
      "Prediction: The unofficial company motto of Nvidia during the early days was \"Thirty days from bankruptcy.\"\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Ask several questions in a single call: the context will be compressed only once\n",
    "\n",
    "questions = [\n",
    "    \"What happened on March 1, 2024?\",\n",
    "    \"What was the unofficial company motto of Nvidia during the early days?\",\n",
    "]\n",
    "\n",
    "true_answers = [\n",
    "    \"Nvidia became the third company in the history of the United States to close with a market capitalization in excess of $2 trillion\",\n",
    "    \"Our company is thirty days from going out of business\",\n",
    "]\n",
    "\n",
    "outputs = pipe(context, questions=questions, press=press)\n",
    "pred_answers = outputs[\"answers\"]\n",
    "\n",
    "# One block per question, separated by a blank line\n",
    "for question, pred_answer, true_answer in zip(questions, pred_answers, true_answers):\n",
    "    print(\n",
    "        f\"Question:   {question}\",\n",
    "        f\"Answer:     {true_answer}\",\n",
    "        f\"Prediction: {pred_answer}\",\n",
    "        sep=\"\\n\",\n",
    "        end=\"\\n\\n\",\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Question:              What is GTC ?\n",
      "Answer:                Nvidia's GPU Technology Conference (GTC) is a series of technical conferences held around the world.\n",
      "Prediction w/o prefix: GTC stands for GPU Technology Conference. It is a series of technical conferences held by Nvidia, a multinational technology company that specializes in graphics processing units (\n",
      "Prediction w/ prefix : Come on you don't know GTC ? Everyone knows GTC. GTC stands for GPU Technology Conference. It is a series of technical conferences held by Nvidia, a multinational technology company that specializes in\n"
     ]
    }
   ],
   "source": [
    "# Start the answer with a given prefix and cap the answer length with max_new_tokens\n",
    "\n",
    "question = \"What is GTC ?\"\n",
    "true_answer = \"Nvidia's GPU Technology Conference (GTC) is a series of technical conferences held around the world.\"\n",
    "answer_prefix = \"Come on you don't know GTC ? Everyone\"\n",
    "max_new_tokens = 30\n",
    "\n",
    "# Arguments shared by both runs; only the answer prefix differs\n",
    "shared_kwargs = {\"question\": question, \"press\": press, \"max_new_tokens\": max_new_tokens}\n",
    "pred_answer_with_prefix = pipe(context, answer_prefix=answer_prefix, **shared_kwargs)[\"answer\"]\n",
    "pred_answer_without_prefix = pipe(context, **shared_kwargs)[\"answer\"]\n",
    "\n",
    "# The prefix is prepended when displaying the prefixed prediction\n",
    "print(\n",
    "    f\"Question:              {question}\",\n",
    "    f\"Answer:                {true_answer}\",\n",
    "    f\"Prediction w/o prefix: {pred_answer_without_prefix}\",\n",
    "    f\"Prediction w/ prefix : {answer_prefix + pred_answer_with_prefix}\",\n",
    "    sep=\"\\n\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Question:         Complete this sentence: In April 2016, Nvidia produced the DGX-1 based on an 8 GPU cluster,\n",
      "Answer:           to improve the ability of users to use deep learning by combining GPUs with integrated deep learning software\n",
      "Prediction w/ Q:  to improve the ability of users to use deep learning by combining GPUs with integrated deep learning software.\n",
      "Prediction w/o Q: In April 2016, Nvidia produced the DGX-1 based on an 8 GPU cluster, which was the first commercially available deep learning system.\n"
     ]
    }
   ],
   "source": [
    "# SnapKV uses the latest queries to prune the KV cache. It is hence more efficient to include the question during compression, as the latest queries will then correspond to the question.\n",
    "# However, it also implies that SnapKV cannot compress the context well independently of the question (e.g. in a chat use case).\n",
    "\n",
    "question = \"Complete this sentence: In April 2016, Nvidia produced the DGX-1 based on an 8 GPU cluster,\"\n",
    "true_answer = (\n",
    "    \"to improve the ability of users to use deep learning by combining GPUs with integrated deep learning software\"\n",
    ")\n",
    "\n",
    "# Dedicated name: the press selected earlier stays bound to `press`, so re-running previous cells keeps using it\n",
    "snapkv_press = SnapKVPress(compression_ratio=0.8)\n",
    "\n",
    "# First run: the question is appended to the context passed to the pipeline; second run: it is passed separately\n",
    "pred_answer_with_question = pipe(context + question, press=snapkv_press)[\"answer\"]\n",
    "pred_answer_without_question = pipe(context, question=question, press=snapkv_press)[\"answer\"]\n",
    "\n",
    "print(f\"Question:         {question}\")\n",
    "print(f\"Answer:           {true_answer}\")\n",
    "print(f\"Prediction w/ Q:  {pred_answer_with_question}\")\n",
    "print(f\"Prediction w/o Q: {pred_answer_without_question}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
